Set up environment and load data into R

# Point the session at the project folder so the relative CSV path below
# resolves.
setwd("~/Documents/R/NYPD_MVC/")
# Read the collision records; the first line of the CSV holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
mvc.raw <- read.csv(file = "NYPD_Motor_Vehicle_Collisions.csv", header = TRUE)

Inspect the data

  1. Use the str() function to check the structure of the data
# Print a compact overview of mvc.raw: its class, its dimensions, and the type
# and first few values of every column.
str(mvc.raw)
## 'data.frame':    333117 obs. of  29 variables:
##  $ DATE                         : Factor w/ 559 levels "01/01/2015","01/01/2016",..: 351 80 80 80 80 80 80 80 80 80 ...
##  $ TIME                         : Factor w/ 1440 levels "0:00","0:01",..: 506 807 811 816 821 836 837 881 886 912 ...
##  $ BOROUGH                      : Factor w/ 6 levels "","BRONX","BROOKLYN",..: 1 3 4 1 1 5 1 1 1 2 ...
##  $ ZIP.CODE                     : int  NA 11210 10029 NA NA 11419 NA NA NA 10466 ...
##  $ LATITUDE                     : num  40.7 40.6 40.8 40.6 NA ...
##  $ LONGITUDE                    : num  -74 -74 -73.9 -74.1 NA ...
##  $ LOCATION                     : Factor w/ 51604 levels "","(40.4991346, -74.2434848)",..: 23187 9531 41437 5466 1 20818 1 1 1 50914 ...
##  $ ON.STREET.NAME               : Factor w/ 7275 levels "","?EST 125 STREET",..: 1 3349 2669 1 6424 149 3893 1032 1 2840 ...
##  $ CROSS.STREET.NAME            : Factor w/ 7843 levels "","01247","043 PCT",..: 1 3088 4720 1 6776 1026 46 2724 1 3348 ...
##  $ OFF.STREET.NAME              : Factor w/ 21951 levels "","(26 BROOKLYN TERMINAL MARKET LOT)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ NUMBER.OF.PERSONS.INJURED    : int  1 0 0 0 0 1 0 0 2 0 ...
##  $ NUMBER.OF.PERSONS.KILLED     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBER.OF.PEDESTRIANS.INJURED: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBER.OF.PEDESTRIANS.KILLED : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBER.OF.CYCLIST.INJURED    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBER.OF.CYCLIST.KILLED     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBER.OF.MOTORIST.INJURED   : int  1 0 0 0 0 1 0 0 2 0 ...
##  $ NUMBER.OF.MOTORIST.KILLED    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CONTRIBUTING.FACTOR.VEHICLE.1: Factor w/ 48 levels "","Accelerator Defective",..: 46 46 22 43 46 34 35 46 15 6 ...
##  $ CONTRIBUTING.FACTOR.VEHICLE.2: Factor w/ 48 levels "","Accelerator Defective",..: 46 46 20 46 46 46 35 46 15 46 ...
##  $ CONTRIBUTING.FACTOR.VEHICLE.3: Factor w/ 41 levels "","Accelerator Defective",..: 40 1 1 1 1 1 1 1 40 1 ...
##  $ CONTRIBUTING.FACTOR.VEHICLE.4: Factor w/ 37 levels "","Accelerator Defective",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ CONTRIBUTING.FACTOR.VEHICLE.5: Factor w/ 24 levels "","Aggressive Driving/Road Rage",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ UNIQUE.KEY                   : int  3467644 3386830 3386649 3387085 3386424 3386935 3386475 3386473 3386684 3386719 ...
##  $ VEHICLE.TYPE.CODE.1          : Factor w/ 18 levels "","AMBULANCE",..: 10 10 10 14 15 10 10 15 10 15 ...
##  $ VEHICLE.TYPE.CODE.2          : Factor w/ 18 levels "","AMBULANCE",..: 10 15 16 10 1 10 15 15 14 15 ...
##  $ VEHICLE.TYPE.CODE.3          : Factor w/ 18 levels "","AMBULANCE",..: 10 1 1 1 1 1 1 1 10 1 ...
##  $ VEHICLE.TYPE.CODE.4          : Factor w/ 17 levels "","AMBULANCE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ VEHICLE.TYPE.CODE.5          : Factor w/ 14 levels "","BICYCLE","BUS",..: 1 1 1 1 1 1 1 1 1 1 ...

Clean up data

  1. Convert the DATE column from a factor to the Date class
# Parse the month/day/four-digit-year text in DATE into Date values, in place.
mvc.raw[["DATE"]] <- as.Date(mvc.raw[["DATE"]], format = "%m/%d/%Y")
  1. Subset the data to rows that have both latitude and longitude
# Drop every row in which either coordinate is missing, keeping all columns.
mvc.xy <- mvc.raw[!(is.na(mvc.raw$LATITUDE) | is.na(mvc.raw$LONGITUDE)), ]
  1. Use the complete.cases() function to remove rows that have an NA in any column
# Keep only the rows of mvc.raw that have no NA in any column.
mvc.del_NA <- mvc.raw[which(complete.cases(mvc.raw)), ]

Plots

  1. Collision incidents grouped by borough
library(ggplot2)
library(scales)
# Bar chart counting the rows of mvc.del_NA in each BOROUGH.
# Written with ggplot() + geom_bar() because qplot() is deprecated as of
# ggplot2 3.4.0.
ggplot(mvc.del_NA, aes(x = BOROUGH)) +
  geom_bar()

  1. Collision incidents by date
# Histogram of DATE in 10-day bins, with one panel per BOROUGH laid out side
# by side. Written with ggplot() because qplot() is deprecated as of
# ggplot2 3.4.0.
g2 <- ggplot(mvc.del_NA, aes(x = DATE)) +
  geom_histogram(binwidth = 10) +
  facet_grid(. ~ BOROUGH)
# Put an axis break every four months labelled as year-month, and rotate the
# labels so they do not overlap in the narrow panels. scale_x_date() takes the
# break width and label format directly, so no scales helper functions are
# needed here.
g2 +
  scale_x_date(date_breaks = "4 months", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 90))

  1. Collision incidents on a map (using the leaflet package)
library(leaflet)
# Interactive map with one clustered marker per row of mvc.xy.
# The coordinate columns are named explicitly rather than left for
# addMarkers() to guess from the column names.
leaflet(mvc.xy) %>%
  addTiles() %>%
  addMarkers(
    lng = ~LONGITUDE,
    lat = ~LATITUDE,
    clusterOptions = markerClusterOptions()
  )